Skunkware 5

home *** CD-ROM | disk | FTP | other *** search

/ Skunkware 5 / Skunkware 5.iso / src / Tools / freeWAIS-sf-1.1 / kidsf1.1.pl < prev next >

Wrap

Perl Script | 1995-07-27 | 23KB | 492 lines

#!/usr/skunk/bin/perl # kidofwais.pl -- WAIS search interface # (derived from sonofwais.pl, derived from wais.pl) # # a url of "http://your.www.server/cgibindir/kidofwais.pl" will search the # default WAIS index; a url of the form: # "http://your.www.server/cgibindir/kidofwais.pl/another_wais_src" # will search the WAIS index called "another_wais_src". # ----------- wais.pl and derivatives history: # Tony Sanders <sanders@bsdi.com>, Nov 1993 Original author of wais.pl # # by Eric Lease Morgan, NCSU Libraries, April 1994 (sonofwais.pl) # Modified to present the user "human-readable" titles, better instructions as # well as the ability to do repeated searches after receiving results. # eric_morgan@ncsu.edu # http://dewey.lib.ncsu.edu/staff/morgan/morgan.html # To read more about this script try: # http://dewey.lib.ncsu.edu/staff/morgan/son-of-wais.html # NOTE: the "headline" code only works right if you are using the "-t URL" # option when you create the WAIS index using "waisindex" # ---- Mike Grady, UIUC/CCSO, m-grady@uiuc.edu, http://ewshp2.cso.uiuc.edu/ # further modified by Mike Grady, UIUC/CCSO, June 1994 -- changed title routine # to use some combination of original here and version from Sean Dowd, EDS # (that version referred to as wais-ncsa-httpd.pl). Also used file typing idea # from Sean's version and Kevin Hughes's (EIT) wwwwais.c. # New: Added a hiliting option that works in conjunction with a second cgi # script called "print_hit_bold.pl". It attempts to hilite the searchterms # which occur in the document, and to place an "anchor" just before the # first occurrence so that the document is positioned to there when it # is displayed (at least this works with NCSA's HTTPD and Mosaic). # Seeing the Encyclopedia Britannica's stuff got me to thinking how I # might do this. # # also added support for Multitype (multiformat) query responses (using the # option "-M TYPE1,TYPE2,..." when waisindexing). Waisindex blows up on me # if I try to use the "-M" option AND the "-t URL" option. And if I tried # adding a -T option (at least with TEXT), the "headline" ended up being just # the filename without any path info. So don't use "-t" OR "-T" in conjunction # with "-M" (the important thing is: make sure the headline field in the # waisindex src ".cat" file has the form: "filename /path/to/file/" ). # The alternate forms of the file available are listed following the "Score # line. Marc Andreeson's "Mosaic And Wais Tutorial" discusses Multiformat # indexing at "http://wintermute.ncsa.uiuc.edu:8080/wais-tutorial/wais.html". # 6/29/94 -- Now have added in support for optional "title tables", both for # indices themselves (used as title on "search" page), and for filetypes # that don't lend themselves to extracting a nice title to display on the # hit list (in my case, pdf type files). # 6/30/94 -- Added support for searching multiple indices with one search. This # uses a "Source table" which serves as both a title table for the indices (as # mentioned above), and as a "process control table". See the comments in the # sample table for more description. This is all "turned on" by setting the # the flag $use_Source_table to "1"; if you set it to "0", no table will be # expected or used. # 7/20/94 -- Few minor changes in search "numbers" display: removed "lines" # since it is often meaningless, changed "bytes" to be rounded to nearest kbyte. # Small changes to <TITLE> processing in &extractTitle. # 8/4/94 -- fixed a bug with multi-index code, added DEBUG option # 8/15/94 -- fixed a bug: occasionally WAIS returns two "odd" files that are # information from index.src and index.cat -- exclude these files from list # do we need to debug the returned lines from waisq? If so, write into log. $DEBUG = 0; # set to 1 to turn on debugging code; set next to logfile $debugLOG = "/usr/httpd/logs/kidsf.log"; # where is your waiq binary? $waisq = "/usr/local/bin/waisqsf1.1"; # where are your source files? $waisd = "/usr/wais/wais-sources"; # what database do you want to search? (Default) $default_src = "gruntsco"; # what is the opening title you want to present to users (default) $openingTitle = "Search the gruntsco sf HTML archives"; # after searching, what do you want the title to be? (default) $closingTitle = "Search results from the gruntsco sf HTML archives"; # Use a table to look up title (and other info) to use on Search page? # If 1, then should supply filename for table. $use_Source_table = 1; $Source_table = "/usr/wais/wais-sources/Source_table"; # Specify the directory where your WWW docs reside -- same as in "kidofwais.pl" # (this is the same path you subtracted when you waisindexed (using -t url)) $wwwDocpath = "/b/httpd/doc/"; # Specify the url for this WWW server -- same as in "kidofwais.pl" script # (this is the same string you added when you waisindexed (using -t url)) $serverURL = "http://gruntsco.pdev.sco.com/"; # maximum number of hits to return $max_hits = 40; # Should we use the "highlighting script" for filetypes that "make sense" for # it? (1 is yes, 0 is no) $use_hilite = 0; # specify the www url to the "highlighting script" (used for .html and .txt) # whether this is used is controlled by $use_hilite flag. $hilite_script = "http://gruntsco.pdev.sco.com/cgi-bin/print_hit_bold.pl/"; # specify the "first hit anchor" to be used with hiliting $anchor = "#first_hit"; # who maintains this service? #$maintainer = "<A HREF=http://ewshp2.cso.uiuc.edu/index.html>Michael A. Grady</A> (m-grady@uiuc.edu)"; $maintainer = "<A HREF=http://gruntsco.pdev.sco.com/author/hiramc.html>Hiram Clawson (hiramc@sco.COM) x7519</A>"; # and when was it last modified. $modified = "12 May, 1995"; # you shouldn't have to edit anything below this line, except if you want to change the help text sub extractTitle { # try and get the <title> ... </title> field from file # only try to find it in the first 5 lines, and then give up local($fl) = @_; local($intitle) = 0; local($title) = "$theFile: No title, please let $maintainer know."; local($linenum) = 1; local($_); # read the file and extract the title # this is a combination of code from Eric Lease Morgan and Sean Dowd open (FP, "$fl") || return "File $theFile can't be read. Please contact $maintainer."; while (<FP>) { chop; last if ($linenum > 5); $linenum ++; if (/<TITLE\s?>(.*)<\/TITLE\s?>/i) { # all on one line $title = $1; last; } elsif (/<TITLE\s?>(.*)$/i) { # on multiple lines $title = $1; $intitle = 1; } elsif (/^(.*)<\/TITLE\s?>/i) { # finish of multiple lines $title = "$title$1"; $intitle = 0; last; } elsif ($intitle) { # add to title, and keep going $title = "$title$_"; } } close (FP); $title =~ s/^\s*//; # remove whitespace at front $title =~ s/\s*$//; # remove whitespace at end $title = "$theFile: Empty title, please let $maintainer know." unless $title; return $title; } # end sub extractTitle sub extractTableTitle { # Get file titles from table. Add a call to this # routine for any filetype that you decide to create titles for. # Currently, only the PDF type is set up to call this. (See the # subroutine &type_file). If nothing is found, it returns the current # $doc_title unchanged (which is the filename relative to doc root). # Table read into array at first reference to it. local ($fl) = @_; return $doc_title if ($file_title_table eq ""); local($_,$name_to_find,$table_entry,$filename,$filetitle); # Change the next line to "$name_to_find = $theFile" if you want to # use the whole path relative to doc root as the name to lookup in the # table. The current code uses just the filename w/o path info. $name_to_find = substr($fl, rindex($fl, '/') + 1); # "basename" if ($current_file_title_array ne $file_title_table) { # Read in new # table to array undef %title_array; # erase current array $current_file_title_array = $file_title_table; open (TABLE_TITLE, $file_title_table) || return $doc_title; while ($table_entry = <TABLE_TITLE>) { chop; next if $table_entry =~ /^\s*#/; # skip comments next if $table_entry =~ /^\s*$/; # skip blank lines ($filename, $filetitle) = split(/~/, $table_entry, 2); $title_array{$filename} = $filetitle; } close (TABLE_TITLE); } if ($filetitle = $title_array{$name_to_find}) { return $filetitle;} else { return $doc_title;} } sub send_index { print "Content-type: text/html\n\n"; print "<HEAD>\n<TITLE>$openingTitle</TITLE>\n<ISINDEX></HEAD>\n"; print "<BODY>\n<H2>", $openingTitle, "</H2>\n"; print ""; print "This is an index of some of the information on this server. "; print "To use this function, simply enter a query. "; print "Since this is a WAIS index, you can enter complex queries. For example:"; print "<DT>Right-hand truncation (stemming) queries"; print "<DD>The query 'astro*' will find documents containing the words"; print " 'astronomy' as well as 'astrophysics'."; print "<DT>Boolean 'And' queries"; print "<DD>The query 'red and blue' will find the intersection of all"; print " the documents containing the words 'red', and 'blue'."; print "The use of 'and' limits your retrieval."; print "<DT>Boolean 'Or' queries"; print "<DD>The query 'red or blue' will find the union of all the"; print " documents containing the words 'red' and 'blue'."; print "The use of 'or' increases your retrieval."; print "<DT>Boolean 'Not' queries"; print "<DD>The query 'red not green' will find all the documents containing"; print " the word 'red', and excluding the documents containing the word 'green'."; print "The use of 'not' limits your retrieval."; print "<DT>Nested Boolean queries"; print "<DD>The query '(red and green) or blue not pink' will find the union of all"; print " the documents containing the words 'red', and 'green'. It will then add (union)"; print " all documents containing the word 'blue'. Finally, it will exclude all documents"; print " containing the word 'pink'"; print "<HR>"; print "This page is maintained by $maintainer, and it was last modified on $modified.\n"; print "</BODY>\n"; } sub type_file { # Set file type based on file extension; also has the "side effect" of # modifying the $url_to_use if $use_hilite flag is turned on and the # file type is appropriate to use it. # You can add other types if you want very easily. Also, if you have # created a "file_title_table" and want it to be accessed for a # particular filetype, just add "$doc_title = &extractTableTitle ($_);" # to the appropriate do{} structure below. See the "pdf" file type # below for an example. local($filename) = @_; local($type) = ""; local($_); SUFFIX: for ($filename) { /\.html$/i && do { $type = "HTML file"; $doc_title = &extractTitle ($the_full_File); if ($use_hilite) { $url_to_use = $hilite_script . $filename . "?" . $query_plus . $anchor; } last SUFFIX; }; /\.te?xt$/i && do { $type = "text file"; if ($use_hilite) { $url_to_use = $hilite_script . $filename . "?" . $query_plus . $anchor; } last SUFFIX; }; /\.gif$/i && do { $type = "GIF graphic"; last SUFFIX; }; /\.ps$/i && do { $type = "PostScript file"; last SUFFIX; }; /\.pdf$/i && do { $type = "Acrobat PDF file"; # try to get a better file title $doc_title = &extractTableTitle ($_); last SUFFIX; }; /\.jpg$/i && do { $type = "JPEG graphic"; last SUFFIX; }; /\.mpg$/i && do { $type = "MPEG movie"; last SUFFIX; }; /\.Z$/i && do { $type = "compressed file"; last SUFFIX; }; /\.gz$/i && do { $type = "compressed file"; last SUFFIX; }; /\.au$/i && do { $type = "Sun audio file"; last SUFFIX; }; /\.hqx$/i && do { $type = "Binhex file"; last SUFFIX; }; /\.tar$/i && do { $type = "tar'red file"; last SUFFIX; }; $type = "Unknown type"; # "fall thru" default case } # end suffix return $type; } # end sub type_file sub byscores { $scores[$b] <=> $scores[$a];} # descending numeric sort routine sub print_it { # Print out the hit list, or if multiple sources, save all # the info to be printed in a array (where all the info for # "one" hit becomes one array element), only to be printed # once all sources are searched. This is so we can sort the # combined hit lists into descending order by score. local($line) = @_; if ($src_multiple) { if ($line eq "</DL>\n") { # All sources have been searched, we # can now sort and print the entire # hit list array. print @hit_list[ sort byscores $[..$#hit_list ]; print "$line"; } elsif ($line eq "</DD>\n") { # End of one "hit" listing, $output_hit .= $line; # save in the hit array. push ( @hit_list, $output_hit ); push ( @scores, $score ); $output_hit = ""; } else { # more stuff for the same hit $output_hit .= $line; } } else { print "$line"; } } sub do_wais { $src = $default_src; # if 'PATH_INFO' has a non-null value, then use it as the name of the # WAIS source to search, otherwise will default to $default_src. $path_extension = $ENV{'PATH_INFO'}; if ($path_extension =~ /^\/(.+)$/) { $src = $1; } @Sources = ( $src ); # Initialize array of sources to search if ( $use_Source_table ) { # Read in the Source table info into # associative array. open (INDEX_TITLES, $Source_table) || last; while (<INDEX_TITLES>) { chop; next if /^\s*#/; # skip comments next if /^\s*$/; # skip blank lines ($src_name, $remainder) = split(/~/, $_, 2); $src_array{$src_name} = $remainder; } close (INDEX_TITLES); ($src_title, $src_multiple, $src_prefix, $file_title_table, $sources) = split(/~/, $src_array{$src}); if ($src_title ne "") { $openingTitle = "Search of $src_title"; $closingTitle = "Search results from $src_title"; } $src_multiple && (@Sources = split(/,/, $sources)); # Store the sources } # to be searched. do { &send_index; return; } unless defined @ARGV; # No search terms yet. local(@query) = @ARGV; local($pquery) = join(" ", @query); # NCSA's HTTPD puts backslashes in front of "funny" or "dangerous" # characters in the input supplied thru argv. In the case of search terms # for WAIS, this can screw up the search (parens and "*" get backslashed # and then don't work correctly). So remove the backslashes, AND the # potentially "dangerous" characters ( ; ` ! ). $pquery =~ tr/!\;\`\\//d; # just in case, get rid of ;`! and \ @query = split(' ',$pquery); # and recreate query word array $query_plus = join("+", @query); print "Content-type: text/html\n\n"; # Start the "html" doc to be returned print "<HEAD>\n<TITLE>$closingTitle</TITLE>\n<ISINDEX></HEAD>\n"; print "<BODY>\n<H2>", $closingTitle, "</H2>\n"; print "Index \`$src\' contains the following\n"; print "items relevant to \`$pquery\':\n"; print "<DL>\n"; local($hits, $score, $headline, $lines, $bytes, $type, @types, $date); $DEBUG && do { open (LOG, ">>$debugLOG") || die "can't open log";}; foreach $src (@Sources) { # Search each indicated index for the terms ($src_prefix, $file_title_table) = (split(/~/, $src_array{$src}))[2,3]; open(WAISQ, "-|") || exec ($waisq, "-c", $waisd, "-m", $max_hits, "-f", "-", "-S", "$src.src", "-g", @query); while (<WAISQ>) { $DEBUG && print LOG $_; /:score\s+(\d+)/ && ($score = $1); /:number-of-lines\s+(\d+)/ && ($lines = $1); /:number-of-bytes\s+(\d+)/ && ($bytes = $1); /:type "(.*)"/ && (push (@types, $1)); /:headline "(.*)"/ && ($headline = $1); /:date "(\d+)"/ && ($date = $1, $hits++, &docdone); } close(WAISQ); $total_hits += $hits; $hits = 0; } &print_it ("</DL>\n"); # signal to print out hit array if we've been # building it (for multiple sources). print "<HR>"; print "This page is maintained by $maintainer."; if ($total_hits == 0) { print "Nothing found that matches your query.\n"; } print "</BODY>\n"; # End the "html" doc being returned } sub docdone { # Called for each "hit" returned by waisq local($endfile,$path_to_file,$file_proper,$file_ext,$multi_type,$alt_count); if ($headline =~ /Search produced no result/) { if ($src_multiple) { # don't print source listing if $hits--; # multi-index search } else { print "</DL><HR>"; print $headline, "\n<PRE>"; local ($count) = 0; open(WAISCAT, "$waisd/$src.cat") || die "$src.cat: $!"; while (<WAISCAT>) { last if ($count > 30); # limit to about 30 filenames $count ++; s#(Catalog for database:)\s+.*#$1 $src.src -- some of the files are:#; s#Headline:\s+(.*)#Headline: <A HREF="$1">$1</A>#; print; } close(WAISCAT); print "\n</PRE>\n"; } } elsif (($headline =~ /^Information on database:/) || ($headline =~ /^Catalog for database:/)) { $hits--; } else { # this is a "real" hit ($endfile, $path_to_file) = split(' ', $headline); if ($path_to_file ne "") { # Not indexed with -t url, so headline of # form: "filename /path/to/file/" # Multitype indexed files will probably have this form, as at # least I can't get "-t url" to coexist with "-M type,type" $the_full_File = $theFile = $path_to_file.$endfile; $theFile =~ s/^.*$wwwDocpath//i; $url_to_use = $serverURL.$theFile; } else { # should have been indexed as "-t url", so headline of form # http://your_server_url/path/to/actual/file # get the string to munge $theFile = $url_to_use = $headline; # parse out the file name (remove the server URL from the front) $theFile =~ s/^.*$serverURL//i; # concatenate the "wwwDocpath" variable with the file name $the_full_File = $wwwDocpath.$theFile; } $last_period = rindex($theFile, "."); # need filename without .ext if if ($last_period > 0) { # it turns out to be multitype $file_proper = substr($theFile, 0, $last_period); $file_ext = substr($theFile, $last_period + 1); } $doc_title = $theFile; $type = &type_file ($theFile); # also modifies $url_to_use if flag # $use_hilite is set and right filetype $src_multiple && ($doc_title = "$src_prefix $doc_title"); if ($bytes < 1000) { $calc_bytes = "< 1 Kbyte"; } else { $calc_bytes = int(($bytes + 500)/1000) . " Kbytes"; } &print_it ("<DT><A HREF=\"$url_to_use\">$doc_title</A></DT>\n"); &print_it ("<DD>Relevancy score: $score , Size: $calc_bytes , Type: $type\n"); if (($#types > 0) && ($file_proper ne "")) { # Multitype indexing $alt_count = 0; # offer alternatives foreach $multi_type (@types) { next if $multi_type eq $file_ext; # skip, already listed &print_it ("</DD><DD>... Alternate Types Available: ") if $alt_count == 0; $alt_count ++; $theFile = "$file_proper.$multi_type"; $url_to_use = $serverURL.$theFile; $type = &type_file ($theFile); &print_it ("<A HREF=\"$url_to_use\">$type</A>, "); } # end foreach $multi_type } &print_it ("</DD>\n"); } $score = $headline = $lines = $bytes = $type = $date = ''; $file_proper = $file_ext = ''; @types = (); } open (STDERR,"> /dev/null"); eval '&do_wais';